library(tidyverse)
library(readr)
library(boot)
library(table1)
library(gridExtra)
library(MASS)
library(car)
library(dplyr)
library(leaps)
library(corrplot)
breastcancer_data =
read_csv("Project_2_data.csv") |>
janitor::clean_names()
## Rows: 4024 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Race, Marital Status, T Stage, N Stage, 6th Stage, differentiate, ...
## dbl (5): Age, Tumor Size, Regional Node Examined, Reginol Node Positive, Su...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(breastcancer_data)
## age race marital_status t_stage
## Min. :30.00 Length:4024 Length:4024 Length:4024
## 1st Qu.:47.00 Class :character Class :character Class :character
## Median :54.00 Mode :character Mode :character Mode :character
## Mean :53.97
## 3rd Qu.:61.00
## Max. :69.00
## n_stage x6th_stage differentiate grade
## Length:4024 Length:4024 Length:4024 Length:4024
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## a_stage tumor_size estrogen_status progesterone_status
## Length:4024 Min. : 1.00 Length:4024 Length:4024
## Class :character 1st Qu.: 16.00 Class :character Class :character
## Mode :character Median : 25.00 Mode :character Mode :character
## Mean : 30.47
## 3rd Qu.: 38.00
## Max. :140.00
## regional_node_examined reginol_node_positive survival_months
## Min. : 1.00 Min. : 1.000 Min. : 1.0
## 1st Qu.: 9.00 1st Qu.: 1.000 1st Qu.: 56.0
## Median :14.00 Median : 2.000 Median : 73.0
## Mean :14.36 Mean : 4.158 Mean : 71.3
## 3rd Qu.:19.00 3rd Qu.: 5.000 3rd Qu.: 90.0
## Max. :61.00 Max. :46.000 Max. :107.0
## status
## Length:4024
## Class :character
## Mode :character
##
##
##
bc = breastcancer_data |>
mutate(
race=case_when(
race == "White" ~ 1,
race == "Black" ~ 2,
race == "Other" ~ 3),
marital_status=case_when(
marital_status == "Married" ~ 1,
marital_status == "Divorced" ~ 2,
marital_status == "Single" ~ 3,
marital_status == "Widowed" ~ 4,
marital_status == "Separated" ~ 5),
t_stage=case_when(
t_stage == "T1" ~ 1,
t_stage == "T2" ~ 2,
t_stage == "T3" ~ 3,
t_stage == "T4" ~ 4),
n_stage=case_when(
n_stage == "N1" ~ 1,
n_stage == "N2" ~ 2,
n_stage == "N3" ~ 3),
x6th_stage=case_when(
x6th_stage == "IIA" ~ 1,
x6th_stage == "IIIA" ~ 2,
x6th_stage == "IIIC" ~ 3,
x6th_stage == "IIB" ~ 4,
x6th_stage == "IIIB" ~ 5),
differentiate=case_when(
differentiate == "Poorly differentiated" ~ 1,
differentiate == "Moderately differentiated" ~ 2,
differentiate == "Well differentiated" ~ 3,
differentiate == "Undifferentiated" ~ 4),
grade=case_when(
grade == "1" ~ 1,
grade == "2" ~ 2,
grade == "3" ~ 3,
grade == "anaplastic; Grade IV" ~ 4),
a_stage=case_when(
a_stage == "Regional" ~ 1,
a_stage == "Distant" ~ 0),
estrogen_status=case_when(
estrogen_status == "Positive" ~ 1,
estrogen_status == "Negative" ~ 0),
progesterone_status=case_when(
progesterone_status == "Positive" ~ 1,
progesterone_status == "Negative" ~ 0),
status=case_when(
status == "Alive" ~ 1,
status == "Dead" ~ 0)
)
summary(bc)
## age race marital_status t_stage
## Min. :30.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:47.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :54.00 Median :1.000 Median :1.000 Median :2.000
## Mean :53.97 Mean :1.231 Mean :1.646 Mean :1.785
## 3rd Qu.:61.00 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :69.00 Max. :3.000 Max. :5.000 Max. :4.000
## n_stage x6th_stage differentiate grade
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000
## Median :1.000 Median :2.000 Median :2.000 Median :2.000
## Mean :1.438 Mean :2.405 Mean :1.868 Mean :2.151
## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :3.000 Max. :5.000 Max. :4.000 Max. :4.000
## a_stage tumor_size estrogen_status progesterone_status
## Min. :0.0000 Min. : 1.00 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.: 16.00 1st Qu.:1.0000 1st Qu.:1.0000
## Median :1.0000 Median : 25.00 Median :1.0000 Median :1.0000
## Mean :0.9771 Mean : 30.47 Mean :0.9332 Mean :0.8265
## 3rd Qu.:1.0000 3rd Qu.: 38.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :140.00 Max. :1.0000 Max. :1.0000
## regional_node_examined reginol_node_positive survival_months status
## Min. : 1.00 Min. : 1.000 Min. : 1.0 Min. :0.0000
## 1st Qu.: 9.00 1st Qu.: 1.000 1st Qu.: 56.0 1st Qu.:1.0000
## Median :14.00 Median : 2.000 Median : 73.0 Median :1.0000
## Mean :14.36 Mean : 4.158 Mean : 71.3 Mean :0.8469
## 3rd Qu.:19.00 3rd Qu.: 5.000 3rd Qu.: 90.0 3rd Qu.:1.0000
## Max. :61.00 Max. :46.000 Max. :107.0 Max. :1.0000
We change race, marital_status, t_stage, n_stage, x6th_stage, differentiate, and grade into multiple numeric levels, while a_stage, estrogen_status, progesterone_status, and status to binary levels. The above variables are categorical variables.
And age, tumor_size, regional_node_examined, reginol_node_positive, and survival_months are numeric variables.
plot(bc)
cor(bc) |>
knitr::kable(digits=4,caption="Correlation for all variables")
| age | race | marital_status | t_stage | n_stage | x6th_stage | differentiate | grade | a_stage | tumor_size | estrogen_status | progesterone_status | regional_node_examined | reginol_node_positive | survival_months | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 1.0000 | -0.0970 | 0.0755 | -0.0669 | 0.0029 | -0.0450 | 0.0932 | -0.0993 | 0.0209 | -0.0772 | 0.0598 | -0.0213 | -0.0333 | 0.0126 | -0.0094 | -0.0559 |
| race | -0.0970 | 1.0000 | 0.0349 | 0.0036 | 0.0191 | 0.0173 | -0.0336 | 0.0301 | -0.0020 | 0.0071 | -0.0425 | -0.0209 | 0.0113 | 0.0090 | -0.0025 | 0.0042 |
| marital_status | 0.0755 | 0.0349 | 1.0000 | 0.0257 | 0.0457 | 0.0221 | -0.0117 | 0.0206 | -0.0174 | 0.0207 | -0.0198 | -0.0357 | 0.0077 | 0.0443 | -0.0487 | -0.0731 |
| t_stage | -0.0669 | 0.0036 | 0.0257 | 1.0000 | 0.2770 | 0.5637 | -0.1102 | 0.1315 | -0.2211 | 0.8092 | -0.0610 | -0.0576 | 0.1141 | 0.2431 | -0.0857 | -0.1547 |
| n_stage | 0.0029 | 0.0191 | 0.0457 | 0.2770 | 1.0000 | 0.0939 | -0.1488 | 0.1625 | -0.2606 | 0.2779 | -0.1020 | -0.0937 | 0.3283 | 0.8381 | -0.1396 | -0.2558 |
| x6th_stage | -0.0450 | 0.0173 | 0.0221 | 0.5637 | 0.0939 | 1.0000 | -0.0999 | 0.0972 | -0.1372 | 0.3034 | -0.0417 | -0.0309 | 0.0826 | 0.1427 | -0.0536 | -0.0919 |
| differentiate | 0.0932 | -0.0336 | -0.0117 | -0.1102 | -0.1488 | -0.0999 | 1.0000 | -0.9083 | 0.0437 | -0.0995 | 0.1868 | 0.1758 | -0.0834 | -0.1229 | 0.0584 | 0.1342 |
| grade | -0.0993 | 0.0301 | 0.0206 | 0.1315 | 0.1625 | 0.0972 | -0.9083 | 1.0000 | -0.0395 | 0.1194 | -0.2113 | -0.1799 | 0.0844 | 0.1353 | -0.0677 | -0.1614 |
| a_stage | 0.0209 | -0.0020 | -0.0174 | -0.2211 | -0.2606 | -0.1372 | 0.0437 | -0.0395 | 1.0000 | -0.1239 | 0.0656 | 0.0265 | -0.0690 | -0.2328 | 0.0701 | 0.0966 |
| tumor_size | -0.0772 | 0.0071 | 0.0207 | 0.8092 | 0.2779 | 0.3034 | -0.0995 | 0.1194 | -0.1239 | 1.0000 | -0.0596 | -0.0699 | 0.1044 | 0.2423 | -0.0869 | -0.1342 |
| estrogen_status | 0.0598 | -0.0425 | -0.0198 | -0.0610 | -0.1020 | -0.0417 | 0.1868 | -0.2113 | 0.0656 | -0.0596 | 1.0000 | 0.5133 | -0.0448 | -0.0860 | 0.1285 | 0.1847 |
| progesterone_status | -0.0213 | -0.0209 | -0.0357 | -0.0576 | -0.0937 | -0.0309 | 0.1758 | -0.1799 | 0.0265 | -0.0699 | 0.5133 | 1.0000 | -0.0181 | -0.0781 | 0.0960 | 0.1771 |
| regional_node_examined | -0.0333 | 0.0113 | 0.0077 | 0.1141 | 0.3283 | 0.0826 | -0.0834 | 0.0844 | -0.0690 | 0.1044 | -0.0448 | -0.0181 | 1.0000 | 0.4116 | -0.0221 | -0.0348 |
| reginol_node_positive | 0.0126 | 0.0090 | 0.0443 | 0.2431 | 0.8381 | 0.1427 | -0.1229 | 0.1353 | -0.2328 | 0.2423 | -0.0860 | -0.0781 | 0.4116 | 1.0000 | -0.1352 | -0.2566 |
| survival_months | -0.0094 | -0.0025 | -0.0487 | -0.0857 | -0.1396 | -0.0536 | 0.0584 | -0.0677 | 0.0701 | -0.0869 | 0.1285 | 0.0960 | -0.0221 | -0.1352 | 1.0000 | 0.4765 |
| status | -0.0559 | 0.0042 | -0.0731 | -0.1547 | -0.2558 | -0.0919 | 0.1342 | -0.1614 | 0.0966 | -0.1342 | 0.1847 | 0.1771 | -0.0348 | -0.2566 | 0.4765 | 1.0000 |
corrplot(cor(bc), method = "square", type = "upper", diag = FALSE)
plot1age =
breastcancer_data|>
ggplot(aes(x = age)) +
geom_histogram(color = "blue", fill = alpha("blue", 0.5), binwidth = 5) +
theme_minimal() +
labs(
title = "Age Distribution",
x = "Age",
y = "Frequency"
)
plot1age
plot2race =
breastcancer_data|>
ggplot(aes(x = race)) +
geom_bar(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "Race Distribution",
x = "Race",
y = "Frequency"
)
plot2race
plot3marital =
breastcancer_data|>
ggplot(aes(x = marital_status)) +
geom_bar(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "Marital Status Distribution",
x = "Marital Status",
y = "Frequency"
)
plot3marital
plot4tstage =
breastcancer_data|>
ggplot(aes(x = t_stage)) +
geom_bar(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "T Stage Distribution",
x = "T Stage",
y = "Frequency"
)
plot4tstage
plot5nstage =
breastcancer_data|>
ggplot(aes(x = n_stage)) +
geom_bar(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "N Stage Distribution",
x = "N Stage",
y = "Frequency"
)
plot5nstage
plot6x6thstage =
breastcancer_data|>
ggplot(aes(x = x6th_stage)) +
geom_bar(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "x6th Stage Distribution",
x = "x6th Stage",
y = "Frequency"
)
plot6x6thstage
plot7differentate =
breastcancer_data|>
ggplot(aes(x = differentiate)) +
geom_bar(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "Differentate Distribution",
x = "Differentate Group",
y = "Frequency"
)
plot7differentate
plot8grade =
breastcancer_data|>
ggplot(aes(x = grade)) +
geom_bar(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "Grade Distribution",
x = "Grade",
y = "Frequency"
)
plot8grade
plot9astage =
breastcancer_data|>
ggplot(aes(x = a_stage)) +
geom_bar(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "A_stage Distribution",
x = "A Stage",
y = "Frequency"
)
plot9astage
plot10tumorsize =
breastcancer_data|>
ggplot(aes(x = tumor_size)) +
geom_histogram(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "Tumor Size Distribution",
x = "Tumor Size",
y = "Frequency"
)
plot10tumorsize
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot11estrogen =
breastcancer_data|>
ggplot(aes(x = estrogen_status)) +
geom_bar(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "Estrogen Status Distribution",
x = "Estrogen Status",
y = "Frequency"
)
plot11estrogen
plot12progesterone =
breastcancer_data|>
ggplot(aes(x = progesterone_status)) +
geom_bar(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "Progesterone Status Distribution",
x = "Progesterone Status",
y = "Frequency"
)
plot12progesterone
plot13nodeexamined =
breastcancer_data|>
ggplot(aes(x = regional_node_examined)) +
geom_histogram(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "Regional Node Examined Distribution",
x = "Examined Regional Node",
y = "Frequency"
)
plot13nodeexamined
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot14nodepositive =
breastcancer_data|>
ggplot(aes(x = reginol_node_positive)) +
geom_histogram(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "Regional Node Positive Distribution",
x = "Positive Reginol Node",
y = "Frequency"
)
plot14nodepositive
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Y1 = survive months; (numeric)
Y2 = status; (binary)
plot15survivalmonths =
breastcancer_data|>
ggplot(aes(x = survival_months)) +
geom_histogram(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "Survival Months",
x = "Number of survival months",
y = "Frequency"
)
plot15survivalmonths
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot16status =
bc|>
ggplot(aes(x = status)) +
geom_bar(color = "blue", fill = alpha("blue", 0.5))+
theme_minimal() +
labs(
title = "status",
x = "status",
y = "Frequency"
)
plot16status
grid.arrange(plot1age, plot10tumorsize, plot13nodeexamined,
plot14nodepositive, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We can see that age is approximately normal, while tumor size, regional node examined, and regional node positive are skewed.
grid.arrange(plot2race, plot3marital,plot4tstage,
plot5nstage, plot6x6thstage,plot7differentate,
plot8grade,plot9astage,plot11estrogen,plot12progesterone, ncol = 3)
We know that the tumor size, regional node examined, and regional node positive are skewed. We should do transformation on these variables. Before the transformation, we can use the Box-Cox plot to check which transformation work the best for them.
bc_transform_tumorsize <- boxcox(breastcancer_data$tumor_size ~ 1, lambda = seq(-2, 2, by=0.1))
bc_transform_regionalnode_examined <- boxcox(breastcancer_data$regional_node_examined ~ 1, lambda = seq(-2, 2, by=0.1))
bc_transform_regionalnode_pos <- boxcox(breastcancer_data$reginol_node_positive ~ 1, lambda = seq(-2, 2, by=0.1))
The lambda value of tumor size is close to 0, so we should use log transformation, while the lambda value of regional node examined is around 0.5, we should take a square root to the value, and the lambda value of regional node positive is around -0.5, so we should take a take square root and take an (-1) exponent for transformation.
par(mfrow = c(2, 2))
hist(log(bc$tumor_size))
hist(sqrt(bc$regional_node_examined))
hist(bc$reginol_node_positive**(-0.5))
We can see that tumor size and regional node examined become approximately normal after log transformation, while the regional node positive is still extremely skewed. Therefore, we may consider not using the variable of reginol_node_positive.
newbc1 = bc |>
mutate(ln_tumor=log(tumor_size),
sqrt_examined=sqrt(regional_node_examined)) |>
dplyr::select(-tumor_size) |>
dplyr::select(-regional_node_examined) |>
dplyr::select(-status)
newbc1
## # A tibble: 4,024 × 15
## age race marital_status t_stage n_stage x6th_stage differentiate grade
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 68 1 1 1 1 1 1 3
## 2 50 1 1 2 2 2 2 2
## 3 58 1 2 3 3 3 2 2
## 4 58 1 1 1 1 1 1 3
## 5 47 1 1 2 1 4 1 3
## 6 51 1 3 1 1 1 2 2
## 7 51 1 1 1 1 1 3 1
## 8 40 1 1 2 1 4 2 2
## 9 40 1 2 4 3 3 1 3
## 10 69 1 1 4 3 3 3 1
## # ℹ 4,014 more rows
## # ℹ 7 more variables: a_stage <dbl>, estrogen_status <dbl>,
## # progesterone_status <dbl>, reginol_node_positive <dbl>,
## # survival_months <dbl>, ln_tumor <dbl>, sqrt_examined <dbl>
newbc2 = bc |>
mutate(ln_tumor=log(tumor_size),
sqrt_examined=sqrt(regional_node_examined)) |>
dplyr::select(-tumor_size) |>
dplyr::select(-regional_node_examined) |>
dplyr::select(-survival_months)
newbc2
## # A tibble: 4,024 × 15
## age race marital_status t_stage n_stage x6th_stage differentiate grade
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 68 1 1 1 1 1 1 3
## 2 50 1 1 2 2 2 2 2
## 3 58 1 2 3 3 3 2 2
## 4 58 1 1 1 1 1 1 3
## 5 47 1 1 2 1 4 1 3
## 6 51 1 3 1 1 1 2 2
## 7 51 1 1 1 1 1 3 1
## 8 40 1 1 2 1 4 2 2
## 9 40 1 2 4 3 3 1 3
## 10 69 1 1 4 3 3 3 1
## # ℹ 4,014 more rows
## # ℹ 7 more variables: a_stage <dbl>, estrogen_status <dbl>,
## # progesterone_status <dbl>, reginol_node_positive <dbl>, status <dbl>,
## # ln_tumor <dbl>, sqrt_examined <dbl>
# indicator test when y is status
categorical_vars <- c("race", "marital_status", "t_stage", "n_stage", "x6th_stage",
"differentiate", "grade", "a_stage",
"estrogen_status", "progesterone_status")
newbc2[categorical_vars] <- lapply(newbc2[categorical_vars], factor)
formula <- as.formula("status ~ race + marital_status + t_stage + n_stage + x6th_stage +
differentiate + grade + a_stage + estrogen_status + progesterone_status+ln_tumor + sqrt_examined+ reginol_node_positive+ age")
model <- glm(formula, data = newbc2, family = binomial())
summary(model)
##
## Call:
## glm(formula = formula, family = binomial(), data = newbc2)
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.826084 0.591231 3.089 0.00201 **
## race2 -0.517829 0.161866 -3.199 0.00138 **
## race3 0.412988 0.202323 2.041 0.04123 *
## marital_status2 -0.208737 0.141798 -1.472 0.14100
## marital_status3 -0.137065 0.134860 -1.016 0.30946
## marital_status4 -0.226852 0.192395 -1.179 0.23836
## marital_status5 -0.870139 0.369406 -2.356 0.01850 *
## t_stage2 -0.241275 0.214882 -1.123 0.26151
## t_stage3 -0.463184 0.308144 -1.503 0.13280
## t_stage4 -0.911657 0.451201 -2.021 0.04333 *
## n_stage2 -0.652606 0.238058 -2.741 0.00612 **
## n_stage3 -0.757283 0.301072 -2.515 0.01189 *
## x6th_stage2 0.069464 0.294083 0.236 0.81327
## x6th_stage3 NA NA NA NA
## x6th_stage4 -0.226947 0.231875 -0.979 0.32771
## x6th_stage5 -0.085418 0.528445 -0.162 0.87159
## differentiate2 0.387813 0.104972 3.694 0.00022 ***
## differentiate3 0.922456 0.193026 4.779 1.76e-06 ***
## differentiate4 -0.970582 0.533726 -1.819 0.06899 .
## grade2 NA NA NA NA
## grade3 NA NA NA NA
## grade4 NA NA NA NA
## a_stage1 0.044840 0.266049 0.169 0.86616
## estrogen_status1 0.733142 0.177768 4.124 3.72e-05 ***
## progesterone_status1 0.589919 0.127619 4.623 3.79e-06 ***
## ln_tumor -0.053874 0.138931 -0.388 0.69818
## sqrt_examined 0.256137 0.049748 5.149 2.62e-07 ***
## reginol_node_positive -0.074309 0.015040 -4.941 7.78e-07 ***
## age -0.023985 0.005625 -4.264 2.01e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3444.7 on 4023 degrees of freedom
## Residual deviance: 2951.6 on 3999 degrees of freedom
## AIC: 3001.6
##
## Number of Fisher Scoring iterations: 5
Based on the above indicator test summary, we delete grade and x6th_stage because their output was NA in the output linear model, since NA indicates these predicts may contribute collinearity.
# indicator test when y is Survival Months
categorical_vars <- c("race", "marital_status", "t_stage", "n_stage", "x6th_stage",
"differentiate", "grade", "a_stage",
"estrogen_status", "progesterone_status")
newbc1[categorical_vars] <- lapply(newbc1[categorical_vars], factor)
formula1 <- as.formula("survival_months ~ race + marital_status + t_stage + n_stage + x6th_stage +
differentiate + grade + a_stage + estrogen_status + progesterone_status+ln_tumor + sqrt_examined+ reginol_node_positive+ age")
model1 <- lm(formula1, data = newbc1)
summary(model1)
##
## Call:
## lm(formula = formula1, data = newbc1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.557 -15.564 1.209 18.142 56.447
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.92531 4.78147 13.160 < 2e-16 ***
## race2 -3.75385 1.40240 -2.677 0.00746 **
## race3 1.90888 1.32316 1.443 0.14919
## marital_status2 -0.71596 1.11597 -0.642 0.52119
## marital_status3 -0.70625 1.02666 -0.688 0.49155
## marital_status4 -1.48374 1.57047 -0.945 0.34483
## marital_status5 -6.94522 3.40072 -2.042 0.04119 *
## t_stage2 -1.71846 1.78646 -0.962 0.33614
## t_stage3 -0.79270 2.51886 -0.315 0.75300
## t_stage4 -3.38505 4.42219 -0.765 0.44404
## n_stage2 -0.92483 1.97717 -0.468 0.63999
## n_stage3 -3.46678 2.67103 -1.298 0.19439
## x6th_stage2 -0.41077 2.35318 -0.175 0.86143
## x6th_stage3 NA NA NA NA
## x6th_stage4 0.57306 1.82486 0.314 0.75351
## x6th_stage5 3.64333 5.15554 0.707 0.47980
## differentiate2 0.99136 0.85199 1.164 0.24466
## differentiate3 0.93301 1.22642 0.761 0.44685
## differentiate4 -1.88819 5.23011 -0.361 0.71810
## grade2 NA NA NA NA
## grade3 NA NA NA NA
## grade4 NA NA NA NA
## a_stage1 4.36402 2.67234 1.633 0.10254
## estrogen_status1 8.62044 1.68608 5.113 3.32e-07 ***
## progesterone_status1 1.64605 1.10093 1.495 0.13496
## ln_tumor -1.10747 1.02996 -1.075 0.28233
## sqrt_examined 0.81816 0.34568 2.367 0.01799 *
## reginol_node_positive -0.31002 0.14135 -2.193 0.02835 *
## age -0.04076 0.04140 -0.985 0.32493
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.47 on 3999 degrees of freedom
## Multiple R-squared: 0.04452, Adjusted R-squared: 0.03879
## F-statistic: 7.764 on 24 and 3999 DF, p-value: < 2.2e-16
Based on the above indicator test summary, we delete grade and x6th_stage.
lmfit=lm(survival_months ~ race + marital_status + t_stage + n_stage + differentiate + a_stage + estrogen_status + progesterone_status+ln_tumor + sqrt_examined+ reginol_node_positive+ age, data = newbc1)
summary(lmfit)
##
## Call:
## lm(formula = survival_months ~ race + marital_status + t_stage +
## n_stage + differentiate + a_stage + estrogen_status + progesterone_status +
## ln_tumor + sqrt_examined + reginol_node_positive + age, data = newbc1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.609 -15.585 1.257 18.080 56.256
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.17078 4.76959 13.245 < 2e-16 ***
## race2 -3.72212 1.40154 -2.656 0.00794 **
## race3 1.90515 1.32220 1.441 0.14969
## marital_status2 -0.71759 1.11560 -0.643 0.52011
## marital_status3 -0.71874 1.02534 -0.701 0.48336
## marital_status4 -1.49914 1.56989 -0.955 0.33967
## marital_status5 -7.06187 3.39685 -2.079 0.03769 *
## t_stage2 -1.29191 1.14280 -1.130 0.25834
## t_stage3 -0.86768 2.01176 -0.431 0.66627
## t_stage4 -0.71612 2.78843 -0.257 0.79733
## n_stage2 -1.54282 1.06924 -1.443 0.14912
## n_stage3 -3.94737 2.23622 -1.765 0.07761 .
## differentiate2 1.00442 0.85141 1.180 0.23818
## differentiate3 0.95437 1.22513 0.779 0.43603
## differentiate4 -2.15661 5.21968 -0.413 0.67950
## a_stage1 4.27338 2.66881 1.601 0.10940
## estrogen_status1 8.57571 1.68488 5.090 3.75e-07 ***
## progesterone_status1 1.65003 1.10050 1.499 0.13386
## ln_tumor -1.12285 1.02902 -1.091 0.27526
## sqrt_examined 0.82002 0.34542 2.374 0.01765 *
## reginol_node_positive -0.30629 0.14084 -2.175 0.02971 *
## age -0.04166 0.04135 -1.007 0.31377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.47 on 4002 degrees of freedom
## Multiple R-squared: 0.04432, Adjusted R-squared: 0.03931
## F-statistic: 8.838 on 21 and 4002 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lmfit)
glmfit <- glm(status ~ race + marital_status + t_stage + n_stage + differentiate +
a_stage + estrogen_status + progesterone_status + ln_tumor +
sqrt_examined + reginol_node_positive + age,
data = newbc2, family = binomial)
summary(glmfit)
##
## Call:
## glm(formula = status ~ race + marital_status + t_stage + n_stage +
## differentiate + a_stage + estrogen_status + progesterone_status +
## ln_tumor + sqrt_examined + reginol_node_positive + age, family = binomial,
## data = newbc2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.775787 0.587386 3.023 0.002501 **
## race2 -0.522765 0.161744 -3.232 0.001229 **
## race3 0.419131 0.202312 2.072 0.038293 *
## marital_status2 -0.209325 0.141735 -1.477 0.139709
## marital_status3 -0.140584 0.134542 -1.045 0.296064
## marital_status4 -0.222113 0.192496 -1.154 0.248558
## marital_status5 -0.867369 0.370116 -2.344 0.019104 *
## t_stage2 -0.372488 0.159129 -2.341 0.019243 *
## t_stage3 -0.471905 0.266764 -1.769 0.076894 .
## t_stage4 -1.025962 0.313035 -3.277 0.001047 **
## n_stage2 -0.474091 0.129288 -3.667 0.000245 ***
## n_stage3 -0.637101 0.237639 -2.681 0.007341 **
## differentiate2 0.386027 0.104898 3.680 0.000233 ***
## differentiate3 0.917457 0.192668 4.762 1.92e-06 ***
## differentiate4 -0.947095 0.531127 -1.783 0.074557 .
## a_stage1 0.045211 0.265718 0.170 0.864894
## estrogen_status1 0.737828 0.177538 4.156 3.24e-05 ***
## progesterone_status1 0.588385 0.127499 4.615 3.93e-06 ***
## ln_tumor -0.055274 0.138631 -0.399 0.690103
## sqrt_examined 0.255997 0.049654 5.156 2.53e-07 ***
## reginol_node_positive -0.074961 0.014984 -5.003 5.65e-07 ***
## age -0.023647 0.005618 -4.209 2.56e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3444.7 on 4023 degrees of freedom
## Residual deviance: 2953.4 on 4002 degrees of freedom
## AIC: 2997.4
##
## Number of Fisher Scoring iterations: 5
par(mfrow = c(2, 2))
plot(glmfit)
# Our 1st global model for the predicts without collinearity and not normal predicts
model_global_num = lm(survival_months ~ race + marital_status + t_stage + n_stage +
differentiate + a_stage + estrogen_status + progesterone_status+ln_tumor + sqrt_examined+age, newbc1)
summary(model_global_num)
##
## Call:
## lm(formula = survival_months ~ race + marital_status + t_stage +
## n_stage + differentiate + a_stage + estrogen_status + progesterone_status +
## ln_tumor + sqrt_examined + age, data = newbc1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.099 -15.622 1.257 18.199 53.466
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.72848 4.76490 13.375 < 2e-16 ***
## race2 -3.64322 1.40172 -2.599 0.00938 **
## race3 1.89824 1.32282 1.435 0.15137
## marital_status2 -0.72900 1.11611 -0.653 0.51369
## marital_status3 -0.73946 1.02577 -0.721 0.47102
## marital_status4 -1.48399 1.57061 -0.945 0.34479
## marital_status5 -7.37323 3.39542 -2.172 0.02995 *
## t_stage2 -1.29134 1.14333 -1.129 0.25877
## t_stage3 -0.85478 2.01269 -0.425 0.67108
## t_stage4 -0.87978 2.78871 -0.315 0.75241
## n_stage2 -2.68831 0.93094 -2.888 0.00390 **
## n_stage3 -7.94856 1.27167 -6.250 4.52e-10 ***
## differentiate2 0.98469 0.85176 1.156 0.24772
## differentiate3 0.92668 1.22564 0.756 0.44964
## differentiate4 -2.15317 5.22211 -0.412 0.68013
## a_stage1 4.06226 2.66828 1.522 0.12798
## estrogen_status1 8.57788 1.68567 5.089 3.77e-07 ***
## progesterone_status1 1.63586 1.10099 1.486 0.13741
## ln_tumor -1.16315 1.02933 -1.130 0.25854
## sqrt_examined 0.64625 0.33621 1.922 0.05466 .
## age -0.04416 0.04135 -1.068 0.28563
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.48 on 4003 degrees of freedom
## Multiple R-squared: 0.04319, Adjusted R-squared: 0.03841
## F-statistic: 9.035 on 20 and 4003 DF, p-value: < 2.2e-16
From the summary of the global model, we can see that from the P-value of race, t_stage, differentiate, a_stage, progesterone_status, ln_tumor, sqrt_examined, and age all have the p-value above 0.05 which is not significant. While sqrt_examined have a 0.0559 p-value which is a close call, so we may consider keep it. Since the covariates like marital_status, n_stage, estrogen_status have a P-value smaller than 0.05 indicating significance. Therefore, our 1st partial test may include the significant covariates.
# 1st numeric partial test with all significant covariates
nummodel_partial_1 = lm(survival_months ~ marital_status + n_stage + estrogen_status + sqrt_examined, newbc1)
summary(nummodel_partial_1)
##
## Call:
## lm(formula = survival_months ~ marital_status + n_stage + estrogen_status +
## sqrt_examined, data = newbc1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72.161 -15.683 1.245 18.270 52.520
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.6132 1.8481 33.339 < 2e-16 ***
## marital_status2 -1.1696 1.1130 -1.051 0.293396
## marital_status3 -1.3670 1.0091 -1.355 0.175583
## marital_status4 -2.1762 1.5344 -1.418 0.156189
## marital_status5 -8.0320 3.3909 -2.369 0.017898 *
## n_stage2 -3.3926 0.9133 -3.715 0.000206 ***
## n_stage3 -9.6707 1.1909 -8.120 6.13e-16 ***
## estrogen_status1 10.4829 1.4306 7.327 2.82e-13 ***
## sqrt_examined 0.6343 0.3355 1.891 0.058733 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.52 on 4015 degrees of freedom
## Multiple R-squared: 0.03623, Adjusted R-squared: 0.03431
## F-statistic: 18.87 on 8 and 4015 DF, p-value: < 2.2e-16
# 2nd numeric partial test with all in-significant covariates
nummodel_partial_2 = lm(survival_months ~ race + t_stage + differentiate + a_stage + progesterone_status + ln_tumor + age, newbc1)
summary(nummodel_partial_2)
##
## Call:
## lm(formula = survival_months ~ race + t_stage + differentiate +
## a_stage + progesterone_status + ln_tumor + age, data = newbc1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -73.014 -15.237 1.229 18.385 49.728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.19189 4.50616 14.689 < 2e-16 ***
## race2 -4.43559 1.39084 -3.189 0.001438 **
## race3 1.80738 1.33187 1.357 0.174848
## t_stage2 -1.51866 1.15089 -1.320 0.187061
## t_stage3 -1.33650 2.02590 -0.660 0.509480
## t_stage4 -1.54160 2.80670 -0.549 0.582860
## differentiate2 2.06327 0.84699 2.436 0.014894 *
## differentiate3 2.17977 1.22124 1.785 0.074355 .
## differentiate4 -4.14026 5.25692 -0.788 0.430988
## a_stage1 8.85815 2.58816 3.423 0.000627 ***
## progesterone_status1 4.86565 0.96354 5.050 4.62e-07 ***
## ln_tumor -1.66881 1.03323 -1.615 0.106358
## age -0.04877 0.04041 -1.207 0.227487
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.67 on 4011 degrees of freedom
## Multiple R-squared: 0.02513, Adjusted R-squared: 0.02221
## F-statistic: 8.617 on 12 and 4011 DF, p-value: < 2.2e-16
# 3rd numeric partial test with some significant in 2nd test added
nummodel_partial_3 = lm(survival_months ~ marital_status + n_stage + estrogen_status + sqrt_examined + a_stage + progesterone_status + ln_tumor, newbc1)
summary(nummodel_partial_3)
##
## Call:
## lm(formula = survival_months ~ marital_status + n_stage + estrogen_status +
## sqrt_examined + a_stage + progesterone_status + ln_tumor,
## data = newbc1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72.751 -15.561 1.111 18.340 53.113
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.7481 3.5394 17.729 < 2e-16 ***
## marital_status2 -1.0285 1.1121 -0.925 0.35509
## marital_status3 -1.2491 1.0082 -1.239 0.21546
## marital_status4 -2.1415 1.5331 -1.397 0.16254
## marital_status5 -7.8368 3.3883 -2.313 0.02078 *
## n_stage2 -2.8026 0.9282 -3.019 0.00255 **
## n_stage3 -8.2702 1.2634 -6.546 6.66e-11 ***
## estrogen_status1 8.8999 1.6608 5.359 8.85e-08 ***
## sqrt_examined 0.6575 0.3355 1.960 0.05010 .
## a_stage1 3.9132 2.4926 1.570 0.11650
## progesterone_status1 1.8102 1.0946 1.654 0.09826 .
## ln_tumor -1.6793 0.5603 -2.997 0.00274 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.49 on 4012 degrees of freedom
## Multiple R-squared: 0.03978, Adjusted R-squared: 0.03715
## F-statistic: 15.11 on 11 and 4012 DF, p-value: < 2.2e-16
# Our 1st global model for the predicts without colinearity and not normal predicts
model_global_bin = glm(status ~ race + marital_status + t_stage + n_stage +
differentiate + a_stage +estrogen_status + progesterone_status + ln_tumor + sqrt_examined + age, family = binomial, newbc2)
summary(model_global_bin)
##
## Call:
## glm(formula = status ~ race + marital_status + t_stage + n_stage +
## differentiate + a_stage + estrogen_status + progesterone_status +
## ln_tumor + sqrt_examined + age, family = binomial, data = newbc2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.025431 0.583766 3.470 0.000521 ***
## race2 -0.490040 0.161448 -3.035 0.002403 **
## race3 0.412102 0.201264 2.048 0.040602 *
## marital_status2 -0.213770 0.140919 -1.517 0.129273
## marital_status3 -0.149790 0.134179 -1.116 0.264275
## marital_status4 -0.220417 0.191076 -1.154 0.248681
## marital_status5 -0.936900 0.365551 -2.563 0.010378 *
## t_stage2 -0.355769 0.158618 -2.243 0.024902 *
## t_stage3 -0.450653 0.266574 -1.691 0.090925 .
## t_stage4 -1.044577 0.312476 -3.343 0.000829 ***
## n_stage2 -0.733552 0.117910 -6.221 4.93e-10 ***
## n_stage3 -1.575446 0.141592 -11.127 < 2e-16 ***
## differentiate2 0.375295 0.104473 3.592 0.000328 ***
## differentiate3 0.901142 0.192252 4.687 2.77e-06 ***
## differentiate4 -0.960780 0.535816 -1.793 0.072955 .
## a_stage1 0.002227 0.262767 0.008 0.993239
## estrogen_status1 0.730261 0.176959 4.127 3.68e-05 ***
## progesterone_status1 0.573254 0.127304 4.503 6.70e-06 ***
## ln_tumor -0.065530 0.138852 -0.472 0.636966
## sqrt_examined 0.180931 0.046777 3.868 0.000110 ***
## age -0.024590 0.005591 -4.398 1.09e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3444.7 on 4023 degrees of freedom
## Residual deviance: 2978.6 on 4003 degrees of freedom
## AIC: 3020.6
##
## Number of Fisher Scoring iterations: 5
From the summary of the global model, we can see that from the P-value of a_stage, ln_tumor, marital_status(2, 3, 4) have the p-value above 0.05 which is not significant. While differentiate has a 0.0729 p-value, and t_stage3 has a p-value of 0.0909, we may consider keep them. The covariates like t_stage, n_stage, race, estrogen_status, progesterone_status, sqrt_examined, age have a P-value smaller than 0.05 indicating significance. Therefore, our 1st partial test may include the significant covariates.
# 1st binary partial test with all significant covariates
# We will still keep marital_status because one group in this variable is significant
binmodel_partial_1 = glm(status ~ race + marital_status + t_stage + n_stage + differentiate + estrogen_status + progesterone_status + sqrt_examined + age, family = binomial, newbc2)
summary(binmodel_partial_1)
##
## Call:
## glm(formula = status ~ race + marital_status + t_stage + n_stage +
## differentiate + estrogen_status + progesterone_status + sqrt_examined +
## age, family = binomial, data = newbc2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.851463 0.377142 4.909 9.15e-07 ***
## race2 -0.488627 0.161358 -3.028 0.002460 **
## race3 0.411208 0.201230 2.043 0.041006 *
## marital_status2 -0.215703 0.140825 -1.532 0.125594
## marital_status3 -0.151385 0.134089 -1.129 0.258903
## marital_status4 -0.221820 0.191055 -1.161 0.245631
## marital_status5 -0.938507 0.365357 -2.569 0.010207 *
## t_stage2 -0.408664 0.112638 -3.628 0.000285 ***
## t_stage3 -0.555367 0.148281 -3.745 0.000180 ***
## t_stage4 -1.125881 0.244474 -4.605 4.12e-06 ***
## n_stage2 -0.737853 0.117555 -6.277 3.46e-10 ***
## n_stage3 -1.581593 0.137038 -11.541 < 2e-16 ***
## differentiate2 0.377070 0.104385 3.612 0.000303 ***
## differentiate3 0.904754 0.191998 4.712 2.45e-06 ***
## differentiate4 -0.961568 0.535510 -1.796 0.072557 .
## estrogen_status1 0.730173 0.176789 4.130 3.62e-05 ***
## progesterone_status1 0.573495 0.127268 4.506 6.60e-06 ***
## sqrt_examined 0.181071 0.046740 3.874 0.000107 ***
## age -0.024489 0.005585 -4.385 1.16e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3444.7 on 4023 degrees of freedom
## Residual deviance: 2978.9 on 4005 degrees of freedom
## AIC: 3016.9
##
## Number of Fisher Scoring iterations: 5
# 2nd binary partial test with all in-significant covariates
binmodel_partial_2 = glm(status ~ a_stage + ln_tumor, family = binomial, newbc2)
summary(binmodel_partial_2)
##
## Call:
## glm(formula = status ~ a_stage + ln_tumor, family = binomial,
## data = newbc2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.53355 0.34110 7.428 1.11e-13 ***
## a_stage1 1.02938 0.22401 4.595 4.32e-06 ***
## ln_tumor -0.55245 0.07032 -7.856 3.97e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3444.7 on 4023 degrees of freedom
## Residual deviance: 3351.7 on 4021 degrees of freedom
## AIC: 3357.7
##
## Number of Fisher Scoring iterations: 4
# 3rd binary partial test with some significant in 2nd test added
binmodel_partial_3 = glm(status ~ marital_status + t_stage + n_stage + differentiate + estrogen_status + progesterone_status + sqrt_examined + age + a_stage + ln_tumor, family = binomial, newbc2)
summary(binmodel_partial_3)
##
## Call:
## glm(formula = status ~ marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + age +
## a_stage + ln_tumor, family = binomial, data = newbc2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.985848 0.582219 3.411 0.000648 ***
## marital_status2 -0.247509 0.140237 -1.765 0.077575 .
## marital_status3 -0.231808 0.131581 -1.762 0.078119 .
## marital_status4 -0.281274 0.189215 -1.487 0.137138
## marital_status5 -0.986920 0.363073 -2.718 0.006563 **
## t_stage2 -0.346774 0.158600 -2.186 0.028781 *
## t_stage3 -0.450100 0.267133 -1.685 0.092003 .
## t_stage4 -1.056179 0.311620 -3.389 0.000701 ***
## n_stage2 -0.736774 0.117479 -6.272 3.57e-10 ***
## n_stage3 -1.584207 0.141279 -11.213 < 2e-16 ***
## differentiate2 0.390591 0.104008 3.755 0.000173 ***
## differentiate3 0.920784 0.191675 4.804 1.56e-06 ***
## differentiate4 -0.993288 0.534044 -1.860 0.062894 .
## estrogen_status1 0.731365 0.176189 4.151 3.31e-05 ***
## progesterone_status1 0.573479 0.126984 4.516 6.30e-06 ***
## sqrt_examined 0.183193 0.046609 3.930 8.48e-05 ***
## age -0.024564 0.005550 -4.426 9.60e-06 ***
## a_stage1 -0.004192 0.262185 -0.016 0.987243
## ln_tumor -0.056204 0.139460 -0.403 0.686939
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3444.7 on 4023 degrees of freedom
## Residual deviance: 2993.1 on 4005 degrees of freedom
## AIC: 3031.1
##
## Number of Fisher Scoring iterations: 5
numglobal_backward = stepAIC(model_global_num, direction = "backward")
## Start: AIC=25070.25
## survival_months ~ race + marital_status + t_stage + n_stage +
## differentiate + a_stage + estrogen_status + progesterone_status +
## ln_tumor + sqrt_examined + age
##
## Df Sum of Sq RSS AIC
## - differentiate 3 836.3 2023197 25066
## - t_stage 3 846.4 2023207 25066
## - marital_status 4 2945.4 2025306 25068
## - age 1 576.2 2022937 25069
## - ln_tumor 1 645.1 2023006 25070
## <none> 2022361 25070
## - progesterone_status 1 1115.3 2023476 25070
## - a_stage 1 1171.0 2023532 25071
## - sqrt_examined 1 1866.6 2024227 25072
## - race 2 4781.2 2027142 25076
## - estrogen_status 1 13082.5 2035443 25094
## - n_stage 2 20595.8 2042956 25107
##
## Step: AIC=25065.91
## survival_months ~ race + marital_status + t_stage + n_stage +
## a_stage + estrogen_status + progesterone_status + ln_tumor +
## sqrt_examined + age
##
## Df Sum of Sq RSS AIC
## - t_stage 3 882.6 2024079 25062
## - marital_status 4 2962.9 2026160 25064
## - age 1 479.8 2023677 25065
## - ln_tumor 1 716.7 2023914 25065
## <none> 2023197 25066
## - a_stage 1 1125.8 2024323 25066
## - progesterone_status 1 1248.8 2024446 25066
## - sqrt_examined 1 1874.8 2025072 25068
## - race 2 5014.2 2028211 25072
## - estrogen_status 1 14398.4 2037595 25092
## - n_stage 2 21605.8 2044803 25105
##
## Step: AIC=25061.67
## survival_months ~ race + marital_status + n_stage + a_stage +
## estrogen_status + progesterone_status + ln_tumor + sqrt_examined +
## age
##
## Df Sum of Sq RSS AIC
## - marital_status 4 3040.5 2027120 25060
## - age 1 490.1 2024570 25061
## <none> 2024079 25062
## - progesterone_status 1 1249.9 2025329 25062
## - a_stage 1 1279.0 2025358 25062
## - sqrt_examined 1 1802.0 2025881 25063
## - race 2 4999.6 2029079 25068
## - ln_tumor 1 5102.6 2029182 25070
## - estrogen_status 1 14429.7 2038509 25088
## - n_stage 2 21732.7 2045812 25101
##
## Step: AIC=25059.71
## survival_months ~ race + n_stage + a_stage + estrogen_status +
## progesterone_status + ln_tumor + sqrt_examined + age
##
## Df Sum of Sq RSS AIC
## - age 1 630.3 2027750 25059
## <none> 2027120 25060
## - a_stage 1 1348.9 2028469 25060
## - progesterone_status 1 1388.3 2028508 25060
## - sqrt_examined 1 1851.9 2028972 25061
## - race 2 6008.1 2033128 25068
## - ln_tumor 1 5187.6 2032308 25068
## - estrogen_status 1 14491.8 2041612 25086
## - n_stage 2 22365.2 2049485 25100
##
## Step: AIC=25058.96
## survival_months ~ race + n_stage + a_stage + estrogen_status +
## progesterone_status + ln_tumor + sqrt_examined
##
## Df Sum of Sq RSS AIC
## <none> 2027750 25059
## - a_stage 1 1315.7 2029066 25060
## - progesterone_status 1 1511.7 2029262 25060
## - sqrt_examined 1 1947.2 2029698 25061
## - ln_tumor 1 4929.4 2032680 25067
## - race 2 5995.2 2033745 25067
## - estrogen_status 1 14111.6 2041862 25085
## - n_stage 2 22758.1 2050508 25100
num1_backward = stepAIC(nummodel_partial_1, direction = "backward")
## Start: AIC=25075.43
## survival_months ~ marital_status + n_stage + estrogen_status +
## sqrt_examined
##
## Df Sum of Sq RSS AIC
## <none> 2037077 25075
## - marital_status 4 4519 2041596 25076
## - sqrt_examined 1 1814 2038891 25077
## - estrogen_status 1 27241 2064317 25127
## - n_stage 2 35253 2072329 25140
num2_backward = stepAIC(nummodel_partial_2, direction = "backward")
## Start: AIC=25129.5
## survival_months ~ race + t_stage + differentiate + a_stage +
## progesterone_status + ln_tumor + age
##
## Df Sum of Sq RSS AIC
## - t_stage 3 1017.9 2061552 25126
## - age 1 748.5 2061282 25129
## <none> 2060534 25130
## - ln_tumor 1 1340.1 2061874 25130
## - differentiate 3 3850.4 2064384 25131
## - race 2 6602.0 2067136 25138
## - a_stage 1 6017.7 2066552 25139
## - progesterone_status 1 13099.9 2073634 25153
##
## Step: AIC=25125.49
## survival_months ~ race + differentiate + a_stage + progesterone_status +
## ln_tumor + age
##
## Df Sum of Sq RSS AIC
## - age 1 760.6 2062312 25125
## <none> 2061552 25126
## - differentiate 3 3959.8 2065512 25127
## - race 2 6616.8 2068169 25134
## - a_stage 1 7095.3 2068647 25137
## - ln_tumor 1 10180.1 2071732 25143
## - progesterone_status 1 13093.7 2074645 25149
##
## Step: AIC=25124.97
## survival_months ~ race + differentiate + a_stage + progesterone_status +
## ln_tumor
##
## Df Sum of Sq RSS AIC
## <none> 2062312 25125
## - differentiate 3 3693.9 2066006 25126
## - race 2 6605.5 2068918 25134
## - a_stage 1 7048.3 2069361 25137
## - ln_tumor 1 9857.2 2072170 25142
## - progesterone_status 1 13410.4 2075723 25149
num3_backward = stepAIC(nummodel_partial_3, direction = "backward")
## Start: AIC=25066.58
## survival_months ~ marital_status + n_stage + estrogen_status +
## sqrt_examined + a_stage + progesterone_status + ln_tumor
##
## Df Sum of Sq RSS AIC
## <none> 2029574 25067
## - marital_status 4 4171.9 2033745 25067
## - a_stage 1 1246.9 2030820 25067
## - progesterone_status 1 1383.5 2030957 25067
## - sqrt_examined 1 1942.7 2031516 25068
## - ln_tumor 1 4544.3 2034118 25074
## - estrogen_status 1 14527.3 2044101 25093
## - n_stage 2 22634.1 2052208 25107
numglobal_backward = stepAIC(model_global_num, direction = "forward")
## Start: AIC=25070.25
## survival_months ~ race + marital_status + t_stage + n_stage +
## differentiate + a_stage + estrogen_status + progesterone_status +
## ln_tumor + sqrt_examined + age
num1_backward = stepAIC(nummodel_partial_1, direction = "forward")
## Start: AIC=25075.43
## survival_months ~ marital_status + n_stage + estrogen_status +
## sqrt_examined
num2_backward = stepAIC(nummodel_partial_2, direction = "forward")
## Start: AIC=25129.5
## survival_months ~ race + t_stage + differentiate + a_stage +
## progesterone_status + ln_tumor + age
num3_backward = stepAIC(nummodel_partial_3, direction = "forward")
## Start: AIC=25066.58
## survival_months ~ marital_status + n_stage + estrogen_status +
## sqrt_examined + a_stage + progesterone_status + ln_tumor
numglobal_backward = stepAIC(model_global_num, direction = "both")
## Start: AIC=25070.25
## survival_months ~ race + marital_status + t_stage + n_stage +
## differentiate + a_stage + estrogen_status + progesterone_status +
## ln_tumor + sqrt_examined + age
##
## Df Sum of Sq RSS AIC
## - differentiate 3 836.3 2023197 25066
## - t_stage 3 846.4 2023207 25066
## - marital_status 4 2945.4 2025306 25068
## - age 1 576.2 2022937 25069
## - ln_tumor 1 645.1 2023006 25070
## <none> 2022361 25070
## - progesterone_status 1 1115.3 2023476 25070
## - a_stage 1 1171.0 2023532 25071
## - sqrt_examined 1 1866.6 2024227 25072
## - race 2 4781.2 2027142 25076
## - estrogen_status 1 13082.5 2035443 25094
## - n_stage 2 20595.8 2042956 25107
##
## Step: AIC=25065.91
## survival_months ~ race + marital_status + t_stage + n_stage +
## a_stage + estrogen_status + progesterone_status + ln_tumor +
## sqrt_examined + age
##
## Df Sum of Sq RSS AIC
## - t_stage 3 882.6 2024079 25062
## - marital_status 4 2962.9 2026160 25064
## - age 1 479.8 2023677 25065
## - ln_tumor 1 716.7 2023914 25065
## <none> 2023197 25066
## - a_stage 1 1125.8 2024323 25066
## - progesterone_status 1 1248.8 2024446 25066
## - sqrt_examined 1 1874.8 2025072 25068
## + differentiate 3 836.3 2022361 25070
## - race 2 5014.2 2028211 25072
## - estrogen_status 1 14398.4 2037595 25092
## - n_stage 2 21605.8 2044803 25105
##
## Step: AIC=25061.67
## survival_months ~ race + marital_status + n_stage + a_stage +
## estrogen_status + progesterone_status + ln_tumor + sqrt_examined +
## age
##
## Df Sum of Sq RSS AIC
## - marital_status 4 3040.5 2027120 25060
## - age 1 490.1 2024570 25061
## <none> 2024079 25062
## - progesterone_status 1 1249.9 2025329 25062
## - a_stage 1 1279.0 2025358 25062
## - sqrt_examined 1 1802.0 2025881 25063
## + t_stage 3 882.6 2023197 25066
## + differentiate 3 872.5 2023207 25066
## - race 2 4999.6 2029079 25068
## - ln_tumor 1 5102.6 2029182 25070
## - estrogen_status 1 14429.7 2038509 25088
## - n_stage 2 21732.7 2045812 25101
##
## Step: AIC=25059.71
## survival_months ~ race + n_stage + a_stage + estrogen_status +
## progesterone_status + ln_tumor + sqrt_examined + age
##
## Df Sum of Sq RSS AIC
## - age 1 630.3 2027750 25059
## <none> 2027120 25060
## - a_stage 1 1348.9 2028469 25060
## - progesterone_status 1 1388.3 2028508 25060
## - sqrt_examined 1 1851.9 2028972 25061
## + marital_status 4 3040.5 2024079 25062
## + t_stage 3 960.2 2026160 25064
## + differentiate 3 889.9 2026230 25064
## - race 2 6008.1 2033128 25068
## - ln_tumor 1 5187.6 2032308 25068
## - estrogen_status 1 14491.8 2041612 25086
## - n_stage 2 22365.2 2049485 25100
##
## Step: AIC=25058.96
## survival_months ~ race + n_stage + a_stage + estrogen_status +
## progesterone_status + ln_tumor + sqrt_examined
##
## Df Sum of Sq RSS AIC
## <none> 2027750 25059
## - a_stage 1 1315.7 2029066 25060
## + age 1 630.3 2027120 25060
## - progesterone_status 1 1511.7 2029262 25060
## + marital_status 4 3180.7 2024570 25061
## - sqrt_examined 1 1947.2 2029698 25061
## + t_stage 3 972.1 2026778 25063
## + differentiate 3 779.2 2026971 25063
## - ln_tumor 1 4929.4 2032680 25067
## - race 2 5995.2 2033745 25067
## - estrogen_status 1 14111.6 2041862 25085
## - n_stage 2 22758.1 2050508 25100
num1_backward = stepAIC(nummodel_partial_1, direction = "both")
## Start: AIC=25075.43
## survival_months ~ marital_status + n_stage + estrogen_status +
## sqrt_examined
##
## Df Sum of Sq RSS AIC
## <none> 2037077 25075
## - marital_status 4 4519 2041596 25076
## - sqrt_examined 1 1814 2038891 25077
## - estrogen_status 1 27241 2064317 25127
## - n_stage 2 35253 2072329 25140
num2_backward = stepAIC(nummodel_partial_2, direction = "both")
## Start: AIC=25129.5
## survival_months ~ race + t_stage + differentiate + a_stage +
## progesterone_status + ln_tumor + age
##
## Df Sum of Sq RSS AIC
## - t_stage 3 1017.9 2061552 25126
## - age 1 748.5 2061282 25129
## <none> 2060534 25130
## - ln_tumor 1 1340.1 2061874 25130
## - differentiate 3 3850.4 2064384 25131
## - race 2 6602.0 2067136 25138
## - a_stage 1 6017.7 2066552 25139
## - progesterone_status 1 13099.9 2073634 25153
##
## Step: AIC=25125.49
## survival_months ~ race + differentiate + a_stage + progesterone_status +
## ln_tumor + age
##
## Df Sum of Sq RSS AIC
## - age 1 760.6 2062312 25125
## <none> 2061552 25126
## - differentiate 3 3959.8 2065512 25127
## + t_stage 3 1017.9 2060534 25130
## - race 2 6616.8 2068169 25134
## - a_stage 1 7095.3 2068647 25137
## - ln_tumor 1 10180.1 2071732 25143
## - progesterone_status 1 13093.7 2074645 25149
##
## Step: AIC=25124.97
## survival_months ~ race + differentiate + a_stage + progesterone_status +
## ln_tumor
##
## Df Sum of Sq RSS AIC
## <none> 2062312 25125
## + age 1 760.6 2061552 25126
## - differentiate 3 3693.9 2066006 25126
## + t_stage 3 1030.1 2061282 25129
## - race 2 6605.5 2068918 25134
## - a_stage 1 7048.3 2069361 25137
## - ln_tumor 1 9857.2 2072170 25142
## - progesterone_status 1 13410.4 2075723 25149
num3_backward = stepAIC(nummodel_partial_3, direction = "both")
## Start: AIC=25066.58
## survival_months ~ marital_status + n_stage + estrogen_status +
## sqrt_examined + a_stage + progesterone_status + ln_tumor
##
## Df Sum of Sq RSS AIC
## <none> 2029574 25067
## - marital_status 4 4171.9 2033745 25067
## - a_stage 1 1246.9 2030820 25067
## - progesterone_status 1 1383.5 2030957 25067
## - sqrt_examined 1 1942.7 2031516 25068
## - ln_tumor 1 4544.3 2034118 25074
## - estrogen_status 1 14527.3 2044101 25093
## - n_stage 2 22634.1 2052208 25107
binglobal_backward = stepAIC(model_global_bin, direction = "backward")
## Start: AIC=3020.64
## status ~ race + marital_status + t_stage + n_stage + differentiate +
## a_stage + estrogen_status + progesterone_status + ln_tumor +
## sqrt_examined + age
##
## Df Deviance AIC
## - a_stage 1 2978.6 3018.6
## - ln_tumor 1 2978.9 3018.9
## <none> 2978.6 3020.6
## - marital_status 4 2987.5 3021.5
## - t_stage 3 2990.2 3026.2
## - race 2 2993.1 3031.1
## - sqrt_examined 1 2993.7 3033.7
## - estrogen_status 1 2995.5 3035.5
## - progesterone_status 1 2997.9 3037.9
## - age 1 2998.3 3038.3
## - differentiate 3 3011.0 3047.0
## - n_stage 2 3107.3 3145.3
##
## Step: AIC=3018.64
## status ~ race + marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + ln_tumor + sqrt_examined +
## age
##
## Df Deviance AIC
## - ln_tumor 1 2978.9 3016.9
## <none> 2978.6 3018.6
## - marital_status 4 2987.5 3019.5
## - t_stage 3 2991.2 3025.2
## - race 2 2993.1 3029.1
## - sqrt_examined 1 2993.7 3031.7
## - estrogen_status 1 2995.5 3033.5
## - progesterone_status 1 2997.9 3035.9
## - age 1 2998.3 3036.3
## - differentiate 3 3011.0 3045.0
## - n_stage 2 3113.8 3149.8
##
## Step: AIC=3016.87
## status ~ race + marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + age
##
## Df Deviance AIC
## <none> 2978.9 3016.9
## - marital_status 4 2987.8 3017.8
## - race 2 2993.2 3027.2
## - sqrt_examined 1 2994.0 3030.0
## - estrogen_status 1 2995.7 3031.7
## - progesterone_status 1 2998.1 3034.1
## - age 1 2998.4 3034.4
## - t_stage 3 3008.6 3040.6
## - differentiate 3 3011.6 3043.6
## - n_stage 2 3116.4 3150.4
bin1_backward = stepAIC(binmodel_partial_1, direction = "backward")
## Start: AIC=3016.87
## status ~ race + marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + age
##
## Df Deviance AIC
## <none> 2978.9 3016.9
## - marital_status 4 2987.8 3017.8
## - race 2 2993.2 3027.2
## - sqrt_examined 1 2994.0 3030.0
## - estrogen_status 1 2995.7 3031.7
## - progesterone_status 1 2998.1 3034.1
## - age 1 2998.4 3034.4
## - t_stage 3 3008.6 3040.6
## - differentiate 3 3011.6 3043.6
## - n_stage 2 3116.4 3150.4
bin2_backward = stepAIC(binmodel_partial_2, direction = "backward")
## Start: AIC=3357.74
## status ~ a_stage + ln_tumor
##
## Df Deviance AIC
## <none> 3351.7 3357.7
## - a_stage 1 3370.9 3374.9
## - ln_tumor 1 3415.7 3419.7
bin3_backward = stepAIC(binmodel_partial_3, direction = "backward")
## Start: AIC=3031.09
## status ~ marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + age +
## a_stage + ln_tumor
##
## Df Deviance AIC
## - a_stage 1 2993.1 3029.1
## - ln_tumor 1 2993.2 3029.2
## <none> 2993.1 3031.1
## - marital_status 4 3005.1 3035.1
## - t_stage 3 3004.8 3036.8
## - sqrt_examined 1 3008.6 3044.6
## - estrogen_status 1 3010.1 3046.1
## - progesterone_status 1 3012.4 3048.4
## - age 1 3013.0 3049.0
## - differentiate 3 3027.6 3059.6
## - n_stage 2 3123.8 3157.8
##
## Step: AIC=3029.09
## status ~ marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + age +
## ln_tumor
##
## Df Deviance AIC
## - ln_tumor 1 2993.2 3027.2
## <none> 2993.1 3029.1
## - marital_status 4 3005.1 3033.1
## - t_stage 3 3005.8 3035.8
## - sqrt_examined 1 3008.6 3042.6
## - estrogen_status 1 3010.1 3044.1
## - progesterone_status 1 3012.4 3046.4
## - age 1 3013.1 3047.1
## - differentiate 3 3027.6 3057.6
## - n_stage 2 3130.2 3162.2
##
## Step: AIC=3027.25
## status ~ marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + age
##
## Df Deviance AIC
## <none> 2993.2 3027.3
## - marital_status 4 3005.3 3031.3
## - sqrt_examined 1 3008.8 3040.8
## - estrogen_status 1 3010.3 3042.3
## - progesterone_status 1 3012.6 3044.6
## - age 1 3013.1 3045.1
## - t_stage 3 3022.2 3050.2
## - differentiate 3 3028.1 3056.1
## - n_stage 2 3132.5 3162.5
From the backward elimination, the best model with lowest AIC of 3016.87 is: status ~ race + marital_status + t_stage + n_stage + differentiate + estrogen_status + progesterone_status + sqrt_examined + age.
binglobal_backward = stepAIC(model_global_bin, direction = "forward")
## Start: AIC=3020.64
## status ~ race + marital_status + t_stage + n_stage + differentiate +
## a_stage + estrogen_status + progesterone_status + ln_tumor +
## sqrt_examined + age
bin1_backward = stepAIC(binmodel_partial_1, direction = "forward")
## Start: AIC=3016.87
## status ~ race + marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + age
bin2_backward = stepAIC(binmodel_partial_2, direction = "forward")
## Start: AIC=3357.74
## status ~ a_stage + ln_tumor
bin3_backward = stepAIC(binmodel_partial_3, direction = "forward")
## Start: AIC=3031.09
## status ~ marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + age +
## a_stage + ln_tumor
From the forward selection, the best model with lowest AIC of 3016.87 status ~ race + marital_status + t_stage + n_stage + differentiate + estrogen_status + progesterone_status + sqrt_examined + age. Backward and forward elimination selection both produce same AIC value and the models are the same.
binglobal_backward = stepAIC(model_global_bin, direction = "both")
## Start: AIC=3020.64
## status ~ race + marital_status + t_stage + n_stage + differentiate +
## a_stage + estrogen_status + progesterone_status + ln_tumor +
## sqrt_examined + age
##
## Df Deviance AIC
## - a_stage 1 2978.6 3018.6
## - ln_tumor 1 2978.9 3018.9
## <none> 2978.6 3020.6
## - marital_status 4 2987.5 3021.5
## - t_stage 3 2990.2 3026.2
## - race 2 2993.1 3031.1
## - sqrt_examined 1 2993.7 3033.7
## - estrogen_status 1 2995.5 3035.5
## - progesterone_status 1 2997.9 3037.9
## - age 1 2998.3 3038.3
## - differentiate 3 3011.0 3047.0
## - n_stage 2 3107.3 3145.3
##
## Step: AIC=3018.64
## status ~ race + marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + ln_tumor + sqrt_examined +
## age
##
## Df Deviance AIC
## - ln_tumor 1 2978.9 3016.9
## <none> 2978.6 3018.6
## - marital_status 4 2987.5 3019.5
## + a_stage 1 2978.6 3020.6
## - t_stage 3 2991.2 3025.2
## - race 2 2993.1 3029.1
## - sqrt_examined 1 2993.7 3031.7
## - estrogen_status 1 2995.5 3033.5
## - progesterone_status 1 2997.9 3035.9
## - age 1 2998.3 3036.3
## - differentiate 3 3011.0 3045.0
## - n_stage 2 3113.8 3149.8
##
## Step: AIC=3016.87
## status ~ race + marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + age
##
## Df Deviance AIC
## <none> 2978.9 3016.9
## - marital_status 4 2987.8 3017.8
## + ln_tumor 1 2978.6 3018.6
## + a_stage 1 2978.9 3018.9
## - race 2 2993.2 3027.2
## - sqrt_examined 1 2994.0 3030.0
## - estrogen_status 1 2995.7 3031.7
## - progesterone_status 1 2998.1 3034.1
## - age 1 2998.4 3034.4
## - t_stage 3 3008.6 3040.6
## - differentiate 3 3011.6 3043.6
## - n_stage 2 3116.4 3150.4
bin1_backward = stepAIC(binmodel_partial_1, direction = "both")
## Start: AIC=3016.87
## status ~ race + marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + age
##
## Df Deviance AIC
## <none> 2978.9 3016.9
## - marital_status 4 2987.8 3017.8
## - race 2 2993.2 3027.2
## - sqrt_examined 1 2994.0 3030.0
## - estrogen_status 1 2995.7 3031.7
## - progesterone_status 1 2998.1 3034.1
## - age 1 2998.4 3034.4
## - t_stage 3 3008.6 3040.6
## - differentiate 3 3011.6 3043.6
## - n_stage 2 3116.4 3150.4
bin2_backward = stepAIC(binmodel_partial_2, direction = "both")
## Start: AIC=3357.74
## status ~ a_stage + ln_tumor
##
## Df Deviance AIC
## <none> 3351.7 3357.7
## - a_stage 1 3370.9 3374.9
## - ln_tumor 1 3415.7 3419.7
bin3_backward = stepAIC(binmodel_partial_3, direction = "both")
## Start: AIC=3031.09
## status ~ marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + age +
## a_stage + ln_tumor
##
## Df Deviance AIC
## - a_stage 1 2993.1 3029.1
## - ln_tumor 1 2993.2 3029.2
## <none> 2993.1 3031.1
## - marital_status 4 3005.1 3035.1
## - t_stage 3 3004.8 3036.8
## - sqrt_examined 1 3008.6 3044.6
## - estrogen_status 1 3010.1 3046.1
## - progesterone_status 1 3012.4 3048.4
## - age 1 3013.0 3049.0
## - differentiate 3 3027.6 3059.6
## - n_stage 2 3123.8 3157.8
##
## Step: AIC=3029.09
## status ~ marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + age +
## ln_tumor
##
## Df Deviance AIC
## - ln_tumor 1 2993.2 3027.2
## <none> 2993.1 3029.1
## + a_stage 1 2993.1 3031.1
## - marital_status 4 3005.1 3033.1
## - t_stage 3 3005.8 3035.8
## - sqrt_examined 1 3008.6 3042.6
## - estrogen_status 1 3010.1 3044.1
## - progesterone_status 1 3012.4 3046.4
## - age 1 3013.1 3047.1
## - differentiate 3 3027.6 3057.6
## - n_stage 2 3130.2 3162.2
##
## Step: AIC=3027.25
## status ~ marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + age
##
## Df Deviance AIC
## <none> 2993.2 3027.3
## + ln_tumor 1 2993.1 3029.1
## + a_stage 1 2993.2 3029.3
## - marital_status 4 3005.3 3031.3
## - sqrt_examined 1 3008.8 3040.8
## - estrogen_status 1 3010.3 3042.3
## - progesterone_status 1 3012.6 3044.6
## - age 1 3013.1 3045.1
## - t_stage 3 3022.2 3050.2
## - differentiate 3 3028.1 3056.1
## - n_stage 2 3132.5 3162.5
# anova for numeric Y
newbc1$ln_tumor <- ifelse(newbc1$ln_tumor <= 0, NA, newbc1$ln_tumor)
# Apply transformations
newbc1$sqrt_examined <- sqrt(newbc1$sqrt_examined)
newbc1$ln_tumor <- log(newbc1$ln_tumor)
# If you decide to remove rows with any NA values
newbc1 <- na.omit(newbc1)
# Fit the linear model
lm_model <- lm(survival_months ~ marital_status + n_stage + estrogen_status +
sqrt_examined + a_stage + progesterone_status + ln_tumor,
data = newbc1)
# Perform ANOVA
anova_result <- anova(lm_model)
print(anova_result)
## Analysis of Variance Table
##
## Response: survival_months
## Df Sum Sq Mean Sq F value Pr(>F)
## marital_status 4 6820 1705.1 3.3664 0.009289 **
## n_stage 2 40939 20469.4 40.4129 < 2.2e-16 ***
## estrogen_status 1 27053 27053.0 53.4111 3.249e-13 ***
## sqrt_examined 1 1667 1667.4 3.2919 0.069695 .
## a_stage 1 1407 1407.4 2.7786 0.095607 .
## progesterone_status 1 1590 1590.4 3.1400 0.076472 .
## ln_tumor 1 3395 3395.3 6.7033 0.009658 **
## Residuals 4005 2028556 506.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Based on ANOVA test, matial_status, n_stage, estrogen_status, and ln_tumors are significant variables.
# delete insignificant variables from the above anova table and fit a new model
lm_model1 <- lm(survival_months ~ marital_status + n_stage + estrogen_status + ln_tumor,
data = newbc1)
# Perform ANOVA
anova_result1 <- anova(lm_model1)
print(anova_result1)
## Analysis of Variance Table
##
## Response: survival_months
## Df Sum Sq Mean Sq F value Pr(>F)
## marital_status 4 6820 1705.1 3.3611 0.009374 **
## n_stage 2 40939 20469.4 40.3500 < 2.2e-16 ***
## estrogen_status 1 27053 27053.0 53.3279 3.387e-13 ***
## ln_tumor 1 3375 3375.0 6.6529 0.009935 **
## Residuals 4008 2033241 507.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Partial f test for nested model
model_reduced <- lm(survival_months ~ marital_status + n_stage + estrogen_status + ln_tumor,
data = newbc1)
model_full <- lm(survival_months ~ marital_status + n_stage + estrogen_status +
sqrt_examined + a_stage + progesterone_status + ln_tumor,
data = newbc1)
anova(model_reduced, model_full)
## Analysis of Variance Table
##
## Model 1: survival_months ~ marital_status + n_stage + estrogen_status +
## ln_tumor
## Model 2: survival_months ~ marital_status + n_stage + estrogen_status +
## sqrt_examined + a_stage + progesterone_status + ln_tumor
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4008 2033241
## 2 4005 2028556 3 4685.5 3.0835 0.02625 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
With p significant p value, we have enough evidence to show that the full model is better.
model2 <- glm(status ~ race + marital_status + t_stage + n_stage + differentiate + estrogen_status + progesterone_status + sqrt_examined + age, data = newbc2)
anova_result1 <- anova(model2)
print(anova_result1)
## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: status
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 4023 521.70
## race 2 3.6262 4021 518.08
## marital_status 4 2.9196 4017 515.16
## t_stage 3 13.1374 4014 502.02
## n_stage 2 25.1703 4012 476.85
## differentiate 3 6.8813 4009 469.97
## estrogen_status 1 8.9916 4008 460.98
## progesterone_status 1 2.9595 4007 458.02
## sqrt_examined 1 1.8003 4006 456.22
## age 1 2.1022 4005 454.11
summary(model2)
##
## Call:
## glm(formula = status ~ race + marital_status + t_stage + n_stage +
## differentiate + estrogen_status + progesterone_status + sqrt_examined +
## age, data = newbc2)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7708680 0.0439145 17.554 < 2e-16 ***
## race2 -0.0681853 0.0209930 -3.248 0.001172 **
## race3 0.0416639 0.0198151 2.103 0.035560 *
## marital_status2 -0.0259111 0.0167149 -1.550 0.121178
## marital_status3 -0.0175357 0.0153627 -1.141 0.253752
## marital_status4 -0.0321326 0.0235225 -1.366 0.172004
## marital_status5 -0.1345247 0.0508574 -2.645 0.008198 **
## t_stage2 -0.0396395 0.0118704 -3.339 0.000847 ***
## t_stage3 -0.0603490 0.0174234 -3.464 0.000538 ***
## t_stage4 -0.1748054 0.0350531 -4.987 6.40e-07 ***
## n_stage2 -0.0837902 0.0139153 -6.021 1.88e-09 ***
## n_stage3 -0.2421576 0.0183695 -13.183 < 2e-16 ***
## differentiate2 0.0509827 0.0127519 3.998 6.50e-05 ***
## differentiate3 0.0895198 0.0183190 4.887 1.07e-06 ***
## differentiate4 -0.1824586 0.0781803 -2.334 0.019654 *
## estrogen_status1 0.1408815 0.0252403 5.582 2.54e-08 ***
## progesterone_status1 0.0779114 0.0164916 4.724 2.39e-06 ***
## sqrt_examined 0.0191703 0.0050357 3.807 0.000143 ***
## age -0.0026658 0.0006191 -4.306 1.70e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1133866)
##
## Null deviance: 521.70 on 4023 degrees of freedom
## Residual deviance: 454.11 on 4005 degrees of freedom
## AIC: 2680.5
##
## Number of Fisher Scoring iterations: 2
null_model <- glm(status ~ 1, family = binomial, data = newbc2)
anova(null_model, model2, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: status ~ 1
## Model 2: status ~ race + marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + age
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 4023 3444.7
## 2 4005 454.1 18 2990.6 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
With p significant p value, we have enough evidence to show that the full model is better, which is status ~ race + marital_status + t_stage + n_stage + differentiate + estrogen_status + progesterone_status + sqrt_examined + age
# AIC-based selection for the numeric outcome
aic_selected_num_model <- stepAIC(lm_model, direction = "both")
## Start: AIC=25027.99
## survival_months ~ marital_status + n_stage + estrogen_status +
## sqrt_examined + a_stage + progesterone_status + ln_tumor
##
## Df Sum of Sq RSS AIC
## <none> 2028556 25028
## - marital_status 4 4192.1 2032748 25028
## - a_stage 1 1306.7 2029862 25029
## - progesterone_status 1 1506.9 2030063 25029
## - sqrt_examined 1 1812.4 2030368 25030
## - ln_tumor 1 3395.3 2031951 25033
## - estrogen_status 1 14469.8 2043025 25054
## - n_stage 2 23667.3 2052223 25071
summary(aic_selected_num_model)
##
## Call:
## lm(formula = survival_months ~ marital_status + n_stage + estrogen_status +
## sqrt_examined + a_stage + progesterone_status + ln_tumor,
## data = newbc1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72.517 -15.640 1.105 18.394 52.990
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.2697 3.9702 15.181 < 2e-16 ***
## marital_status2 -1.0335 1.1130 -0.929 0.35315
## marital_status3 -1.2632 1.0090 -1.252 0.21066
## marital_status4 -2.1402 1.5343 -1.395 0.16311
## marital_status5 -7.8496 3.3909 -2.315 0.02067 *
## n_stage2 -2.9220 0.9277 -3.150 0.00165 **
## n_stage3 -8.3923 1.2582 -6.670 2.91e-11 ***
## estrogen_status1 8.8846 1.6623 5.345 9.55e-08 ***
## sqrt_examined 2.2560 1.1926 1.892 0.05861 .
## a_stage1 4.0047 2.4934 1.606 0.10832
## progesterone_status1 1.8906 1.0961 1.725 0.08463 .
## ln_tumor -4.2533 1.6428 -2.589 0.00966 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.51 on 4005 degrees of freedom
## Multiple R-squared: 0.03925, Adjusted R-squared: 0.03661
## F-statistic: 14.87 on 11 and 4005 DF, p-value: < 2.2e-16
# BIC-based selection (which is similar to AIC but with a larger penalty for the number of parameters)
bic_selected_num_model <- stepAIC(lm_model, direction = "both", k = log(nrow(newbc1)))
## Start: AIC=25103.57
## survival_months ~ marital_status + n_stage + estrogen_status +
## sqrt_examined + a_stage + progesterone_status + ln_tumor
##
## Df Sum of Sq RSS AIC
## - marital_status 4 4192.1 2032748 25079
## - a_stage 1 1306.7 2029862 25098
## - progesterone_status 1 1506.9 2030063 25098
## - sqrt_examined 1 1812.4 2030368 25099
## - ln_tumor 1 3395.3 2031951 25102
## <none> 2028556 25104
## - estrogen_status 1 14469.8 2043025 25124
## - n_stage 2 23667.3 2052223 25134
##
## Step: AIC=25078.67
## survival_months ~ n_stage + estrogen_status + sqrt_examined +
## a_stage + progesterone_status + ln_tumor
##
## Df Sum of Sq RSS AIC
## - a_stage 1 1377.5 2034125 25073
## - progesterone_status 1 1701.5 2034449 25074
## - sqrt_examined 1 1910.2 2034658 25074
## - ln_tumor 1 3399.5 2036147 25077
## <none> 2032748 25079
## - estrogen_status 1 14532.1 2047280 25099
## + marital_status 4 4192.1 2028556 25104
## - n_stage 2 24643.1 2057391 25110
##
## Step: AIC=25073.1
## survival_months ~ n_stage + estrogen_status + sqrt_examined +
## progesterone_status + ln_tumor
##
## Df Sum of Sq RSS AIC
## - progesterone_status 1 1626.5 2035752 25068
## - sqrt_examined 1 1987.6 2036113 25069
## - ln_tumor 1 3579.3 2037705 25072
## <none> 2034125 25073
## + a_stage 1 1377.5 2032748 25079
## - estrogen_status 1 14980.2 2049105 25094
## + marital_status 4 4263.0 2029862 25098
## - n_stage 2 29815.4 2063941 25115
##
## Step: AIC=25068.01
## survival_months ~ n_stage + estrogen_status + sqrt_examined +
## ln_tumor
##
## Df Sum of Sq RSS AIC
## - sqrt_examined 1 2050.1 2037802 25064
## - ln_tumor 1 3662.4 2039414 25067
## <none> 2035752 25068
## + progesterone_status 1 1626.5 2034125 25073
## + a_stage 1 1302.5 2034449 25074
## + marital_status 4 4451.5 2031300 25092
## - n_stage 2 30573.5 2066325 25111
## - estrogen_status 1 27565.1 2063317 25114
##
## Step: AIC=25063.75
## survival_months ~ n_stage + estrogen_status + ln_tumor
##
## Df Sum of Sq RSS AIC
## - ln_tumor 1 3380.8 2041183 25062
## <none> 2037802 25064
## + sqrt_examined 1 2050.1 2035752 25068
## + progesterone_status 1 1689.0 2036113 25069
## + a_stage 1 1377.6 2036424 25069
## + marital_status 4 4560.8 2033241 25088
## - n_stage 2 28586.3 2066388 25103
## - estrogen_status 1 27469.9 2065272 25109
##
## Step: AIC=25062.11
## survival_months ~ n_stage + estrogen_status
##
## Df Sum of Sq RSS AIC
## <none> 2041183 25062
## + ln_tumor 1 3381 2037802 25064
## + sqrt_examined 1 1768 2039414 25067
## + progesterone_status 1 1766 2039417 25067
## + a_stage 1 1545 2039638 25067
## + marital_status 4 4567 2036616 25086
## - estrogen_status 1 27657 2068840 25108
## - n_stage 2 35418 2076601 25115
summary(bic_selected_num_model)
##
## Call:
## lm(formula = survival_months ~ n_stage + estrogen_status, data = newbc1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71.709 -15.709 1.291 18.423 53.008
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.1554 1.4242 44.344 < 2e-16 ***
## n_stage2 -3.1318 0.8993 -3.483 0.000502 ***
## n_stage3 -9.1636 1.1309 -8.103 7.05e-16 ***
## estrogen_status1 10.5533 1.4312 7.374 2.00e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.55 on 4013 degrees of freedom
## Multiple R-squared: 0.03327, Adjusted R-squared: 0.03255
## F-statistic: 46.03 on 3 and 4013 DF, p-value: < 2.2e-16
According to the result, we can see that based on the AIC selection, the final model according to the survival_month is survival_months ~ marital_status + n_stage + estrogen_status + sqrt_examined + a_stage + progesterone_status + ln_tumor.
According to the BIC selection of the final result, the final model they selected is survival_months ~ n_stage + estrogen_status.
# For the binary outcome, we will use the AIC criterion as well
aic_selected_bin_model <- stepAIC(glmfit, direction = "both")
## Start: AIC=2997.39
## status ~ race + marital_status + t_stage + n_stage + differentiate +
## a_stage + estrogen_status + progesterone_status + ln_tumor +
## sqrt_examined + reginol_node_positive + age
##
## Df Deviance AIC
## - a_stage 1 2953.4 2995.4
## - ln_tumor 1 2953.6 2995.6
## - marital_status 4 2961.2 2997.2
## <none> 2953.4 2997.4
## - t_stage 3 2964.7 3002.7
## - n_stage 2 2967.3 3007.3
## - race 2 2969.2 3009.2
## - estrogen_status 1 2970.5 3012.5
## - age 1 2971.4 3013.4
## - progesterone_status 1 2973.6 3015.6
## - reginol_node_positive 1 2978.6 3020.6
## - sqrt_examined 1 2980.4 3022.4
## - differentiate 3 2986.8 3024.8
##
## Step: AIC=2995.41
## status ~ race + marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + ln_tumor + sqrt_examined +
## reginol_node_positive + age
##
## Df Deviance AIC
## - ln_tumor 1 2953.6 2993.6
## - marital_status 4 2961.2 2995.2
## <none> 2953.4 2995.4
## + a_stage 1 2953.4 2997.4
## - t_stage 3 2966.0 3002.0
## - n_stage 2 2967.5 3005.5
## - race 2 2969.2 3007.2
## - estrogen_status 1 2970.6 3010.6
## - age 1 2971.4 3011.4
## - progesterone_status 1 2973.6 3013.6
## - reginol_node_positive 1 2978.6 3018.6
## - sqrt_examined 1 2980.4 3020.4
## - differentiate 3 2986.8 3022.8
##
## Step: AIC=2993.57
## status ~ race + marital_status + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + reginol_node_positive +
## age
##
## Df Deviance AIC
## - marital_status 4 2961.5 2993.5
## <none> 2953.6 2993.6
## + ln_tumor 1 2953.4 2995.4
## + a_stage 1 2953.6 2995.6
## - n_stage 2 2967.9 3003.9
## - race 2 2969.3 3005.3
## - estrogen_status 1 2970.7 3008.7
## - age 1 2971.4 3009.4
## - progesterone_status 1 2973.8 3011.8
## - reginol_node_positive 1 2978.9 3016.9
## - t_stage 3 2983.2 3017.2
## - sqrt_examined 1 2980.6 3018.6
## - differentiate 3 2987.3 3021.3
##
## Step: AIC=2993.48
## status ~ race + t_stage + n_stage + differentiate + estrogen_status +
## progesterone_status + sqrt_examined + reginol_node_positive +
## age
##
## Df Deviance AIC
## <none> 2961.5 2993.5
## + marital_status 4 2953.6 2993.6
## + ln_tumor 1 2961.2 2995.2
## + a_stage 1 2961.4 2995.4
## - n_stage 2 2975.7 3003.7
## - race 2 2980.3 3008.3
## - estrogen_status 1 2978.4 3008.4
## - age 1 2981.4 3011.4
## - progesterone_status 1 2982.6 3012.6
## - reginol_node_positive 1 2987.8 3017.8
## - t_stage 3 2992.1 3018.1
## - sqrt_examined 1 2989.1 3019.1
## - differentiate 3 2995.1 3021.1
summary(aic_selected_bin_model)
##
## Call:
## glm(formula = status ~ race + t_stage + n_stage + differentiate +
## estrogen_status + progesterone_status + sqrt_examined + reginol_node_positive +
## age, family = binomial, data = newbc2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.623139 0.371232 4.372 1.23e-05 ***
## race2 -0.574227 0.158413 -3.625 0.000289 ***
## race3 0.430601 0.201848 2.133 0.032901 *
## t_stage2 -0.429230 0.112954 -3.800 0.000145 ***
## t_stage3 -0.561433 0.148563 -3.779 0.000157 ***
## t_stage4 -1.122611 0.242936 -4.621 3.82e-06 ***
## n_stage2 -0.474258 0.128697 -3.685 0.000229 ***
## n_stage3 -0.644148 0.234160 -2.751 0.005943 **
## differentiate2 0.389412 0.104623 3.722 0.000198 ***
## differentiate3 0.908192 0.191600 4.740 2.14e-06 ***
## differentiate4 -0.954165 0.525800 -1.815 0.069571 .
## estrogen_status1 0.731594 0.176812 4.138 3.51e-05 ***
## progesterone_status1 0.600531 0.127305 4.717 2.39e-06 ***
## sqrt_examined 0.259110 0.049645 5.219 1.80e-07 ***
## reginol_node_positive -0.076301 0.014939 -5.107 3.27e-07 ***
## age -0.024052 0.005443 -4.419 9.90e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3444.7 on 4023 degrees of freedom
## Residual deviance: 2961.5 on 4008 degrees of freedom
## AIC: 2993.5
##
## Number of Fisher Scoring iterations: 5
# And the BIC criterion
bic_selected_bin_model <- stepAIC(glmfit, direction = "both", family = binomial, k = log(nrow(newbc2)))
## Start: AIC=3135.99
## status ~ race + marital_status + t_stage + n_stage + differentiate +
## a_stage + estrogen_status + progesterone_status + ln_tumor +
## sqrt_examined + reginol_node_positive + age
##
## Df Deviance AIC
## - marital_status 4 2961.2 3110.6
## - t_stage 3 2964.7 3122.4
## - a_stage 1 2953.4 3127.7
## - ln_tumor 1 2953.6 3127.8
## - n_stage 2 2967.3 3133.3
## - race 2 2969.2 3135.2
## <none> 2953.4 3136.0
## - differentiate 3 2986.8 3144.5
## - estrogen_status 1 2970.5 3144.8
## - age 1 2971.4 3145.7
## - progesterone_status 1 2973.6 3147.9
## - reginol_node_positive 1 2978.6 3152.9
## - sqrt_examined 1 2980.4 3154.7
##
## Step: AIC=3110.59
## status ~ race + t_stage + n_stage + differentiate + a_stage +
## estrogen_status + progesterone_status + ln_tumor + sqrt_examined +
## reginol_node_positive + age
##
## Df Deviance AIC
## - t_stage 3 2972.7 3097.2
## - a_stage 1 2961.2 3102.3
## - ln_tumor 1 2961.4 3102.5
## - n_stage 2 2974.9 3107.7
## <none> 2961.2 3110.6
## - race 2 2980.1 3112.9
## - differentiate 3 2994.5 3119.0
## - estrogen_status 1 2978.0 3119.1
## - age 1 2981.3 3122.4
## - progesterone_status 1 2982.3 3123.4
## - reginol_node_positive 1 2987.5 3128.6
## - sqrt_examined 1 2988.7 3129.8
## + marital_status 4 2953.4 3136.0
##
## Step: AIC=3097.15
## status ~ race + n_stage + differentiate + a_stage + estrogen_status +
## progesterone_status + ln_tumor + sqrt_examined + reginol_node_positive +
## age
##
## Df Deviance AIC
## - a_stage 1 2974.2 3090.4
## - n_stage 2 2987.1 3095.0
## <none> 2972.7 3097.2
## - race 2 2991.8 3099.7
## - estrogen_status 1 2989.9 3106.1
## - ln_tumor 1 2989.9 3106.1
## - differentiate 3 3007.4 3107.0
## - progesterone_status 1 2993.3 3109.5
## - age 1 2993.4 3109.7
## + t_stage 3 2961.2 3110.6
## - reginol_node_positive 1 2999.2 3115.4
## - sqrt_examined 1 2999.9 3116.1
## + marital_status 4 2964.7 3122.4
##
## Step: AIC=3090.36
## status ~ race + n_stage + differentiate + estrogen_status + progesterone_status +
## ln_tumor + sqrt_examined + reginol_node_positive + age
##
## Df Deviance AIC
## - n_stage 2 2989.3 3088.9
## <none> 2974.2 3090.4
## - race 2 2993.3 3092.9
## + a_stage 1 2972.7 3097.2
## - estrogen_status 1 2991.9 3099.8
## - differentiate 3 3008.5 3099.8
## - ln_tumor 1 2992.1 3100.0
## + t_stage 3 2961.2 3102.3
## - progesterone_status 1 2994.5 3102.4
## - age 1 2994.7 3102.6
## - reginol_node_positive 1 3000.4 3108.3
## - sqrt_examined 1 3001.7 3109.6
## + marital_status 4 2966.0 3115.4
##
## Step: AIC=3088.9
## status ~ race + differentiate + estrogen_status + progesterone_status +
## ln_tumor + sqrt_examined + reginol_node_positive + age
##
## Df Deviance AIC
## <none> 2989.3 3088.9
## + n_stage 2 2974.2 3090.4
## - race 2 3009.5 3092.5
## + a_stage 1 2987.1 3095.0
## - estrogen_status 1 3008.0 3099.3
## + t_stage 3 2975.1 3099.6
## - age 1 3009.9 3101.2
## - progesterone_status 1 3010.3 3101.6
## - differentiate 3 3027.0 3101.7
## - ln_tumor 1 3013.8 3105.1
## - sqrt_examined 1 3015.5 3106.8
## + marital_status 4 2981.4 3114.2
## - reginol_node_positive 1 3146.2 3237.5
summary(bic_selected_bin_model)
##
## Call:
## glm(formula = status ~ race + differentiate + estrogen_status +
## progesterone_status + ln_tumor + sqrt_examined + reginol_node_positive +
## age, family = binomial, data = newbc2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.451459 0.448634 5.464 4.65e-08 ***
## race2 -0.595023 0.157272 -3.783 0.000155 ***
## race3 0.440768 0.201932 2.183 0.029054 *
## differentiate2 0.417805 0.103868 4.022 5.76e-05 ***
## differentiate3 0.957010 0.190932 5.012 5.38e-07 ***
## differentiate4 -0.916981 0.506763 -1.809 0.070375 .
## estrogen_status1 0.763842 0.175498 4.352 1.35e-05 ***
## progesterone_status1 0.596058 0.126629 4.707 2.51e-06 ***
## ln_tumor -0.366801 0.075028 -4.889 1.01e-06 ***
## sqrt_examined 0.248487 0.048812 5.091 3.57e-07 ***
## reginol_node_positive -0.109570 0.008846 -12.386 < 2e-16 ***
## age -0.024373 0.005417 -4.500 6.80e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3444.7 on 4023 degrees of freedom
## Residual deviance: 2989.3 on 4012 degrees of freedom
## AIC: 3013.3
##
## Number of Fisher Scoring iterations: 5
In the binary model, the final model according to the AIC selection is status ~ race + t_stage + n_stage + differentiate + estrogen_status + progesterone_status + sqrt_examined + reginol_node_positive + age.
The BIC had the selection of model status ~ race + differentiate + estrogen_status + progesterone_status + ln_tumor + sqrt_examined + reginol_node_positive + age.
par(mfrow = c(3,4))
plot(aic_selected_num_model)
plot(bic_selected_num_model)
plot(nummodel_partial_3)
par(mfrow = c(3,4))
plot(aic_selected_bin_model)
plot(bic_selected_bin_model)
plot(binmodel_partial_1)
3.0 transformation edited 3.1 interaction transformation ?? 3.2 partial test 3.3 diagnostic boxcox ??? 之前不是用过了boxcox 4. Step-wise: forward/ backward /AIC 5. final model 6. model assumption (check multicolinearity (VIF)) 7. cross validation